Exploratory Data Analysis - Rainfall

1 Import Packages

pacman::p_load(tidyverse, readr, psych, st, stars, tmap, sf,
               ggstatsplot, plotly, ggplot2, ggdist, dplyr, ggiraph)

2 Load the prepared files

Let’s load the RDS files after data preparation.

rainfall <- readRDS("data/rainfall.rds")
describe(rainfall)
                 vars    n    mean     sd median trimmed    mad  min    max
Station*            1 5547   10.51   5.02   12.0   10.69   5.93    1   18.0
Region*             2 5547    3.51   1.39    4.0    3.63   1.48    1    5.0
Year                3 5547 2006.25  12.34 2010.0 2007.10  13.34 1980 2023.0
Month*              4 5547    6.52   3.45    7.0    6.52   4.45    1   12.0
Date                5 5547     NaN     NA     NA     NaN     NA  Inf   -Inf
TotalRainfall       6 5547  205.65 110.97  190.6  197.50 103.93    0  986.5
TotalRainfall30     7 5547   36.09  62.52    0.0   22.29   0.00    0  320.6
TotalRainfall60     8 5547   45.01  78.85    0.0   27.38   0.00    0  424.0
TotalRainfall120    9 5547   51.38  90.45    0.0   31.05   0.00    0  493.2
                 range  skew kurtosis   se
Station*          17.0 -0.24    -1.28 0.07
Region*            4.0 -0.39    -1.25 0.02
Year              43.0 -0.50    -0.99 0.17
Month*            11.0  0.00    -1.22 0.05
Date              -Inf    NA       NA   NA
TotalRainfall    986.5  0.85     1.51 1.49
TotalRainfall30  320.6  1.64     1.71 0.84
TotalRainfall60  424.0  1.71     2.10 1.06
TotalRainfall120 493.2  1.75     2.35 1.21

3 Map of Singapore

mpsz <- st_read(dsn = "data/geospatial", layer = "MPSZ-2019") %>% 
  st_transform(crs=3414)
Reading layer `MPSZ-2019' from data source 
  `C:\Vanessa\SMU\Term 4 - Visual Analytics & Applications\mvheng\Group11_VAP\EDA\data\geospatial' 
  using driver `ESRI Shapefile'
Simple feature collection with 332 features and 6 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 103.6057 ymin: 1.158699 xmax: 104.0885 ymax: 1.470775
Geodetic CRS:  WGS 84
glimpse(mpsz)
Rows: 332
Columns: 7
$ SUBZONE_N  <chr> "MARINA EAST", "INSTITUTION HILL", "ROBERTSON QUAY", "JURON…
$ SUBZONE_C  <chr> "MESZ01", "RVSZ05", "SRSZ01", "WISZ01", "MUSZ02", "MPSZ05",…
$ PLN_AREA_N <chr> "MARINA EAST", "RIVER VALLEY", "SINGAPORE RIVER", "WESTERN …
$ PLN_AREA_C <chr> "ME", "RV", "SR", "WI", "MU", "MP", "WI", "WI", "SI", "SI",…
$ REGION_N   <chr> "CENTRAL REGION", "CENTRAL REGION", "CENTRAL REGION", "WEST…
$ REGION_C   <chr> "CR", "CR", "CR", "WR", "CR", "CR", "WR", "WR", "CR", "CR",…
$ geometry   <MULTIPOLYGON [m]> MULTIPOLYGON (((33222.98 29..., MULTIPOLYGON (…

Let’s take a look at the planning areas for the 5 regions.

tmap_mode("view")

tm_shape(mpsz) +
  tm_polygons(col = "REGION_N", palette = "Set2")+
  tm_layout(main.title = "Planning Area",
            main.title.position = "left",
            main.title.size = 1,
            legend.show = FALSE,
            frame = FALSE) +
  tmap_options(check.and.fix = TRUE) +
  tm_view(set.zoom.limits = c(11,12))

4 Rainfall analysis

4.1 Analyse rainfall using maps

Let’s map the station to the planning area (PA).

Show the code
station_to_PA <- c(
  "Admiralty" = "WOODLANDS",
  "Ang Mo Kio" = "ANG MO KIO",
  "Boon Lay (East)" = "BOON LAY",
  "Changi" = "CHANGI",
  "Choa Chu Kang (South)" = "CHOA CHU KANG",
  "Clementi" = "CLEMENTI",
  "East Coast Parkway" = "BEDOK",
  "Jurong (West)" = "JURONG WEST",
  "Khatib" = "YISHUN",
  "Marina Barrage" = "DOWNTOWN CORE",
  "Newton" = "NEWTON",
  "Pasir Panjang" = "PASIR PANJANG",
  "Paya Lebar" = "PAYA LEBAR",
  "Seletar" = "SELETAR",
  "Sembawang" = "SEMBAWANG",
  "Tai Seng" = "HOUGANG",
  "Tengah" = "TENGAH",
  "Tuas South" = "TUAS"
)

rainfall$PA <- station_to_PA[rainfall$Station]
rainfall <- rainfall[, c("PA", setdiff(names(rainfall), "PA"))]
head(rainfall)
# A tibble: 6 × 10
  PA        Station  Region  Year Month Date       TotalRainfall TotalRainfall30
  <chr>     <chr>    <chr>  <dbl> <ord> <date>             <dbl>           <dbl>
1 WOODLANDS Admiral… North   2009 Jan   2009-01-01           0.8               0
2 WOODLANDS Admiral… North   2009 Feb   2009-02-01         148                 0
3 WOODLANDS Admiral… North   2009 Mar   2009-03-01         348                 0
4 WOODLANDS Admiral… North   2009 Apr   2009-04-01         149.                0
5 WOODLANDS Admiral… North   2009 May   2009-05-01         206.                0
6 WOODLANDS Admiral… North   2009 Jun   2009-06-01          92                 0
# ℹ 2 more variables: TotalRainfall60 <dbl>, TotalRainfall120 <dbl>
rain_map <- rainfall %>% 
  group_by(PA, Station, Year) %>% 
  summarise(Annual_Rainfall = 
              sum(TotalRainfall, na.rm = TRUE)) %>%
  ungroup()

glimpse(rain_map)
Rows: 469
Columns: 4
$ PA              <chr> "ANG MO KIO", "ANG MO KIO", "ANG MO KIO", "ANG MO KIO"…
$ Station         <chr> "Ang Mo Kio", "Ang Mo Kio", "Ang Mo Kio", "Ang Mo Kio"…
$ Year            <dbl> 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, …
$ Annual_Rainfall <dbl> 830.6, 2849.2, 3050.4, 2579.6, 3240.0, 1961.4, 2018.4,…
mpsztemp <- left_join(mpsz, rain_map,
                         by = c("PLN_AREA_N" = "PA"))
glimpse(mpsztemp)
Rows: 3,485
Columns: 10
$ SUBZONE_N       <chr> "MARINA EAST", "INSTITUTION HILL", "ROBERTSON QUAY", "…
$ SUBZONE_C       <chr> "MESZ01", "RVSZ05", "SRSZ01", "WISZ01", "MUSZ02", "MPS…
$ PLN_AREA_N      <chr> "MARINA EAST", "RIVER VALLEY", "SINGAPORE RIVER", "WES…
$ PLN_AREA_C      <chr> "ME", "RV", "SR", "WI", "MU", "MP", "WI", "WI", "SI", …
$ REGION_N        <chr> "CENTRAL REGION", "CENTRAL REGION", "CENTRAL REGION", …
$ REGION_C        <chr> "CR", "CR", "CR", "WR", "CR", "CR", "WR", "WR", "CR", …
$ Station         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Marina Ba…
$ Year            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2010, 2011…
$ Annual_Rainfall <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1112.0, 23…
$ geometry        <MULTIPOLYGON [m]> MULTIPOLYGON (((33222.98 29..., MULTIPOLY…

Let’s plot the annual mean temperature distribution across Singapore.

tm_shape(mpsztemp) +
  tm_polygons(col = "Annual_Rainfall", 
              palette = "Blues", 
              style = "jenks") +
  tm_view(set.zoom.limits = c(11,12))
Note

It seems like the northern area of Singapore has a cooler mean temperature.

4.2 Rainfall Time Series

4.2.1 Overall - Rainfall Time Series

Show the code
gg <- ggplot(rainfall, aes(x = Date, y = TotalRainfall, 
                         color = factor(Year))) +
    geom_line(linewidth = 0.1) +
    geom_point(aes(text = paste0("Month:", Month, 
                                "<br>Total Rainfall:", TotalRainfall, "mm"))) +
    labs(x = "Year", y = "Monthly Total Rainfall (mm)", color = "Year",
         title = "Trend of Monthly Total Rainfall from 1981 to 2023", 
         subtitle = "Gentle trend line sloping upwards from 1981",
         caption = "Data from Meteorological Service Singapore website") +
    geom_smooth(method = "lm", 
                se = FALSE, color = "black") +
    theme_minimal() 

ggplotly(gg, tooltip = "text") %>%
    layout(title = list(text = 
                        paste0(gg$labels$title, "<br>", "<sup>", 
                               gg$labels$subtitle, "</sup>"),
                        font = list(weight = "bold")),
           showlegend = FALSE,
    annotations = list(text = gg$labels$caption,
                      xref = "paper", yref = "paper",
                      x = 1000, y = 24,
                      xanchor = "right", yanchor = "top",
                      showarrow = FALSE)) 

4.2.2 Rainfall Time Series by station

rain_station <- rainfall %>%
  group_by(Station, Year) %>%
  summarise(rain = sum(TotalRainfall, na.rm = TRUE)) %>%
  ungroup()

rain_station$mean_tooltip <- c(paste0(
  "Year: ", rain_station$Year,
  "\n Station: ", rain_station$Station,
  "\n Total Rainfall: ", rain_station$rain, "mm"))

line <- ggplot(data = rain_station,
               aes(x = Year,
                   y = rain,
                   group = Station,
                   color = Station,
                   data_id = Station)) +
  geom_line_interactive(size = 1.2,
                        alpha = 0.4) +
  geom_point_interactive(aes(tooltip = rain_station$mean_tooltip),
                         fill = "white",
                         size = 1,
                         stroke = 1,
                         shape = 21) +
  theme_classic() +
  ylab("Annual Total Rainfall(mm)") +
  xlab("Year") +
  ggtitle("Annual Total Rainfall") +
  theme(plot.title = element_text(size = 10),
        plot.subtitle = element_text(size = 8)) 

girafe(ggobj = line, 
       width_svg = 8,
       height_svg = 6 * 0.618,
       options = list(
         opts_hover(css = "stroke-width: 2.5; opacity: 1;"),
         opts_hover_inv(css = "stroke-width: 1;opacity:0.6;")))

4.3 Confidence Interval of Total Rainfall

Show the code
rain_yr_error <- rainfall %>%
  group_by(Year) %>%
  summarise(n = n(), rain = sum(TotalRainfall, na.rm = TRUE), 
            sd = sd(TotalRainfall, na.rm = TRUE)) %>%
  mutate(se = sd/sqrt(n-1)) %>% 
  ungroup()

model <- lm(rain ~ Year, rain_yr_error)
y_intercept = coef(model)[1] 
slope_coeff = coef(model)[2]
adjust_yintercept = slope_coeff * 1982 + y_intercept

gg <- ggplot(rain_yr_error) +
       geom_errorbar(aes(x = factor(Year), ymin = rain - 2.58 * se, 
                      ymax = rain + 2.58*se), 
                      width=0.2, colour="black", 
                      alpha=0.9, size=0.5) +
       geom_point(aes(x = factor(Year), y = rain, 
             text = paste0("Year:", `Year`, 
                          "<br>Total Rainfall:", round(rain, digits = 2),
                          "<br>95% CI:[", 
                          round((rain - 2.58 * se), digits = 2), ",",
                          round((rain + 2.58 * se), digits = 2),"]")),
             stat="identity", color="darkred", 
             size = 1.5, alpha = 1) +
       geom_abline(slope = round(slope_coeff, 4), 
                   intercept = adjust_yintercept,
                   untf = TRUE,
                   color = "blue",
                   linetype = "dashed")+
       geom_text(aes(x = 11, y = 27.8, colour = "blue",
                     label = paste0("Rainfall=", 
                                    round(slope_coeff, 4), "* Year ",
                                    round(y_intercept, 4)))) +
       labs (x = "Year", y = "Annual mean temperatures (°C)",
             title = "99% Confidence interval of annual total rainfall by year",
             subtitle = "From 1982 to 2023",
             caption = "Data from Meteorological Service Singapore website") +
       theme_minimal() + 
       theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1),
             plot.title = element_text(face = "bold", size = 12))

ggplotly(gg, tooltip = "text") %>%
    layout(title = list(text = 
                        paste0(gg$labels$title, "<br>", "<sup>", 
                               gg$labels$subtitle, "</sup>"),
                        font = list(weight = "bold")),
           showlegend = FALSE)
Note

We can observe that the total rainfall over the years have increased and the confidence intervals have narrowed.

4.4 Rainfall across the months

4.4.1 Box plot across the months

Show the code
gg <- ggplot(rainfall, 
       aes(x = factor(Month, levels = month.abb), y = TotalRainfall)) +
  geom_violin(color = "navy", fill = "lightblue") +
  geom_hline(data = rainfall, 
             aes(yintercept = mean(TotalRainfall, na.rm = TRUE)),
             linetype = "dashed", size = 1, colour = "brown") +
  geom_text(aes(x = 4.5, y = 27.3, 
                 label = paste0("Total Rainfall : ", 
                                round(sum(TotalRainfall,na.rm = TRUE),2), "mm")), 
            colour = "brown") +
  stat_summary(fun = mean, geom = "point", 
               shape = 20, size = 3, color = "orange",
               aes(text = paste0("Total Rainfall : ",
                                 round(after_stat(y), 2), "mm"))) +
  theme_minimal() +
  labs(title = "Monthly Total Rainfall across each month from 1981 to 2023",
       subtitle = "November to February are cooler as compared to the rest of the year",
        y = "Total Rainfall (mm)",
        x = "Month",
        caption = "Data from Meteorological Service Singapore website")

ggplotly(gg, tooltip = "text") %>%
    layout(title = list(text =
                        paste0(gg$labels$title, "<br>", "<sup>",
                               gg$labels$subtitle, "</sup>"),
                        font = list(weight = "bold")))
Note

We can observe that temperature in November to February are below the average temperature.

4.4.2 Heatmap across the months

Show the code
rain <- rainfall %>% 
        group_by(Year, Month) %>% 
        summarise(TRain = sum(TotalRainfall, na.rm = TRUE))

gg <- ggplot(rain, aes(factor(Month, levels = month.abb), factor(Year), 
                          fill = TRain)) + 
    geom_tile(color = "white",
              aes(text = paste0(Year, "-", Month,
                                "<br>Rainfall:", round(TRain, 2), "°C"))) + 
    theme_minimal() + 
    scale_fill_gradient(name = "Rainfall",
                        low = "sky blue", 
                        high = "dark blue") +
    labs(x = NULL, y = NULL, 
         title = "Total rainfall by year and month",
         subtitle = "Hotter in more months of 2023 as compared to the other years")

ggplotly(gg, tooltip = "text")
Note

We can observe that temperature in May and June are consistently high across the years.